Import library

library(dada2)
packageVersion("dada2") # check dada2 version
## [1] '1.18.0'
library(Biostrings)
library(ShortRead)
library(seqTools) # per base sequence content
library(phyloseq)
library(ggplot2)
library(data.table)
library(plyr)
library(dplyr)
library(qckitfastq) # per base sequence content
library(stringr)

QUALITY CHECK

First, we import the fastq files containing the raw reads. The samples were downloaded from the SRA database with the accession number PRJEB11419. IBS and healthy samples were selected out of the thousands of samples as described in 00_Metadata-AGP.R.

# Save the path to the directory containing the fastq zipped files
path <- "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP"
# list.files(path) # check we are in the right directory

# fastq filenames have format: SAMPLENAME.fastq.gz
# Saves the whole directory path to each file name
FNs <- sort(list.files(file.path(path, "original"), pattern=".fastq.gz", full.names = TRUE))
show(FNs[1:5])
## [1] "/Users/scarcy/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/original/ERR1073039_1.fastq.gz"
## [2] "/Users/scarcy/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/original/ERR1073655_1.fastq.gz"
## [3] "/Users/scarcy/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/original/ERR1073725_1.fastq.gz"
## [4] "/Users/scarcy/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/original/ERR1074570_1.fastq.gz"
## [5] "/Users/scarcy/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/original/ERR1074573_1.fastq.gz"
# Extract sample names, assuming filenames have format: SAMPLENAME.fastq
sample.names <- sapply(strsplit(basename(FNs), "_"), `[`, 1)
show(sample.names[1:5]) # saves only the file name (without the path)
## [1] "ERR1073039" "ERR1073655" "ERR1073725" "ERR1074570" "ERR1074573"
# Look at quality of all files
for (i in 1:4){ # 1:length(FNs)
  show(plotQualityProfile(FNs[i]))
}

# Look at nb of reads per sample
# raw_stats <- data.frame('sample' = sample.names,
#                         'reads' = fastqq(FNs)@nReads)
min(raw_stats$reads)
## [1] Inf
max(raw_stats$reads)
## [1] -Inf
mean(raw_stats$reads)
## [1] NA
# hist(log10(raw_stats$reads))

We will have a quick peak at the per base sequence content of the reads in some samples, to make sure there is no anomaly (i.e. all reads having the same sequence).

# Look at per base sequence content
fseq <- seqTools::fastqq(FNs[4])
## [fastqq] File ( 1/1) '/Users/scarcy/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/original/ERR1074570_1.fastq.gz'  done.
rc <- read_content(fseq)
plot_read_content(rc) + labs(title = "Per base sequence content")

plot_read_content(rc) + xlim(0,50) + labs(title = "Per base sequence content")

LOOK FOR PRIMERS

Now, we will look whether the reads still contain the primers.

# Import metadata, that contains the primer sequences
metadata <- read.csv(file.path(path, "Metadata-AGP.csv"))
table(metadata$pcr_primers..exp.)
## 
## FWD:GTGCCAGCMGCCGCGGTAA; REV:GGACTACHVGGGTWTCTAAT 
##                                                45 
## FWD:GTGYCAGCMGCCGCGGTAA; REV:GGACTACNVGGGTWTCTAAT 
##                                              1245
# V4
FWD <- "GTGYCAGCMGCCGCGGTAA"  # 515F primer sequence
REV <- "GGACTACHVGGGTWTCTAAT" # 806R primer sequence

# Function that, from the primer sequence, will return all combinations possible (complement, reverse complement, ...)
allOrients <- function(primer) {
    # Create all orientations of the input sequence
    require(Biostrings)
    dna <- DNAString(primer)  # The Biostrings works w/ DNAString objects rather than character vectors
    orients <- c(Forward = dna, Complement = complement(dna), Reverse = reverse(dna), 
        RevComp = reverseComplement(dna))
    return(sapply(orients, toString))  # Convert back to character vector
}

# Get all combinations of the primer sequences
FWD.orients <- allOrients(FWD) # 515F
REV.orients <- allOrients(REV) # 806R
FWD.orients # sanity check
##               Forward            Complement               Reverse 
## "GTGYCAGCMGCCGCGGTAA" "CACRGTCGKCGGCGCCATT" "AATGGCGCCGMCGACYGTG" 
##               RevComp 
## "TTACCGCGGCKGCTGRCAC"
REV.orients
##                Forward             Complement                Reverse 
## "GGACTACHVGGGTWTCTAAT" "CCTGATGDBCCCAWAGATTA" "TAATCTWTGGGVHCATCAGG" 
##                RevComp 
## "ATTAGAWACCCBDGTAGTCC"
# Function that counts number of reads in which a sequence is found
primerHits <- function(primer, fn) {
    # Counts number of reads in which the primer is found
    nhits <- vcountPattern(primer, sread(readFastq(fn)), fixed = FALSE, max.mismatch = 2)
    return(sum(nhits > 0))
}

# Get a table to know how many times the 515F and 806R primers are found in the reads of each sample
for (i in 1:4){ # 1:length(FNs)
  cat("SAMPLE", sample.names[i], "with total number of", raw_stats[i,'reads'], "reads\n\n")
  x <-  rbind(ForwardPrimer = sapply(FWD.orients, primerHits, fn = FNs[[i]]), 
              ReversePrimer = sapply(REV.orients, primerHits, fn = FNs[[i]]))
  print(x)
  cat("\n____________________________________________\n\n")
}
## SAMPLE ERR1073039 with total number of reads
## 
##               Forward Complement Reverse RevComp
## ForwardPrimer       0          0       0       0
## ReversePrimer       0          0       0       4
## 
## ____________________________________________
## 
## SAMPLE ERR1073655 with total number of reads
## 
##               Forward Complement Reverse RevComp
## ForwardPrimer       0          0       0       0
## ReversePrimer       0          0       0      33
## 
## ____________________________________________
## 
## SAMPLE ERR1073725 with total number of reads
## 
##               Forward Complement Reverse RevComp
## ForwardPrimer       0          0       0       0
## ReversePrimer       0          0       0       8
## 
## ____________________________________________
## 
## SAMPLE ERR1074570 with total number of reads
## 
##               Forward Complement Reverse RevComp
## ForwardPrimer       5          0       0       0
## ReversePrimer       2          0       0    1600
## 
## ____________________________________________

FILTER AND TRIM

1. Primer removal

The reads do not contain any primer. We will go forward directly with the quality filtering.

2. Quality filtering

Then, we perform a quality filtering of the reads.

# Place filtered files in a filtered/ subdirectory
filt2_samples <- file.path(path, "filtered2", paste0(sample.names, "_filt.fastq.gz"))
# Assign names for the filtered fastq.gz files
names(filt2_samples) <- sample.names

out2 <- filterAndTrim(fwd = FNs, filt = filt2_samples,
                      maxEE=4, # reads with more than 4 expected errors (sum(10e(-Q/10))) are discarded
                      truncQ=10, # Truncate reads at the first instance of a quality score less than or equal to truncQ.
                      minLen=150, # Discard reads shorter than 200 bp. This is done after trimming and truncation.
                      compress=TRUE, # Output files in gzip format
                      verbose=TRUE) # give a message saying the percentage of filtered sequences

# Update list of filtered files
# FNs_filt <- file.path(path, "filtered2", paste0(sample.names, "_filt.fastq.gz"))
FNs_filt <- sort(list.files(file.path(path, "filtered2"), pattern=".fastq.gz", full.names = TRUE))
sample.names_filt <- sapply(strsplit(basename(FNs_filt), "_"), `[`, 1)
names(FNs_filt) <- sample.names_filt

About ~35 samples had a large number of reads but none of them passed the quality filter. Let’s inspect quickly the quality profile of some of them.

for (i in 195:234){ # printing 2 samples before & after the "abnormal samples", that actually passed the quality filter (as comparison)
  show(plotQualityProfile(FNs[i]))
}

The quality profile of these discarded samples show some abnormalities, and reads were also shorter than 150bp. It is not surprising none of the reads passed the quality filtering.

Let’s look at the output filtered fastq files as sanity check.

out2[1:4,] # show how many reads were filtered in each file
##                       reads.in reads.out
## ERR1073039_1.fastq.gz    17661     16321
## ERR1073655_1.fastq.gz    53710     53268
## ERR1073725_1.fastq.gz    26291     26120
## ERR1074570_1.fastq.gz    27191     24237
# Look at quality profile of filtered files
for (i in 1:4){
  show(plotQualityProfile(filt2_samples[i]))
}

LEARN ERROR RATES

Now we will build the parametric error model, to be able to infer amplicon sequence variants (ASVs) later on.

err <- learnErrors(FNs_filt, multithread=TRUE, randomize=TRUE, verbose = 1)

The error rates for each possible transition (A→C, A→G, …) are shown. Points are the observed error rates for each consensus quality score. The black line shows the estimated error rates after convergence of the machine-learning algorithm. The red line shows the error rates expected under the nominal definition of the Q-score. Here the estimated error rates (black line) are a good fit to the observed rates (points), and the error rates drop with increased quality as expected.

plotErrors(err, nominalQ = TRUE)

CONSTRUCT SEQUENCE TABLE

1. Infer sample composition

The dada() algorithm infers sequence variants based on estimated errors (previous step). Firstly, we de-replicate the reads in each sample, to reduce the computation time. De-replication is a common step in almost all modern ASV inference (or OTU picking) pipelines, but a unique feature of derepFastq is that it maintains a summary of the quality information for each dereplicated sequence in $quals.

# Prepare empty vector for the infered sequences
seq_infered <- vector("list", length(sample.names_filt))
names(seq_infered) <- sample.names_filt # names of each row will be the sample names

# Iterate through the 1242 samples
for(sampl in sample.names_filt) {
  cat("Processing:", sampl, "\n") # print which sample is being processed
  derep <- derepFastq(FNs_filt[[sampl]]) # dereplicate the reads in the sample
  seq_infered[[sampl]] <- dada(derep, err=err, multithread=TRUE) # default parameters
}
# Inspect the infered sequence variants from sample 1:5
for (i in 1:5){
  print(seq_infered[[i]])
  print("________________")
}
## dada-class: object describing DADA2 denoising results
## 164 sequence variants were inferred from 3842 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## [1] "________________"
## dada-class: object describing DADA2 denoising results
## 127 sequence variants were inferred from 4897 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## [1] "________________"
## dada-class: object describing DADA2 denoising results
## 270 sequence variants were inferred from 6218 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## [1] "________________"
## dada-class: object describing DADA2 denoising results
## 213 sequence variants were inferred from 6000 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## [1] "________________"
## dada-class: object describing DADA2 denoising results
## 318 sequence variants were inferred from 6087 input unique sequences.
## Key parameters: OMEGA_A = 1e-40, OMEGA_C = 1e-40, BAND_SIZE = 16
## [1] "________________"

2. Construct ASV table

We can now construct an amplicon sequence variant table (ASV) table, a higher-resolution version of the OTU table produced by traditional methods.

# Make sequence table from the infered sequence variants
seqtable <- makeSequenceTable(seq_infered)

# We should have 1242 samples (1242 rows)
dim(seqtable)
## [1]  1242 27790
# Inspect distribution of sequence lengths
hist(nchar(getSequences(seqtable)), breaks = 100, xlab = "ASV length", ylab = "Number of ASVs", main="")

3. Remove chimeras

The core dada method corrects substitution and indel errors, but chimeras remain. Fortunately, the accuracy of sequence variants after denoising makes identifying chimeric ASVs simpler than when dealing with fuzzy OTUs. Chimeric sequences are identified if they can be exactly reconstructed by combining a left-segment and a right-segment from two more abundant “parent” sequences.

seqtable.nochim <- removeBimeraDenovo(seqtable, method="consensus", multithread=TRUE, verbose=TRUE)

# Check how many sequence variants we have after removing chimeras
dim(seqtable.nochim)
## [1]  1242 18431
# Check how many reads we have after removing chimeras (we should keep the vast majority of the reads, like > 80%)
sum(seqtable.nochim)/sum(seqtable)
## [1] 0.9553288

LOOK AT READS COUNT THROUGH PIPELINE

Sanity check before assigning taxonomy.

# Function that counts nb of reads
getN <- function(x) sum(getUniques(x))

# Table that will count number of reads for each process of interest (input reads, filtered reads, denoised reads, non chimera reads)
rownames(out2) <- sapply(strsplit(rownames(out2), "_"), `[`, 1)
track <- out2 %>%
  as.data.frame() %>%
  mutate(Run=rownames(out2)) %>%
  rename("input"="reads.in",
         "quality-filt"="reads.out") %>%
  left_join(sapply(seq_infered, getN) %>%
              as.data.frame %>% 
              rename(denoised=".") %>%
              mutate(Run=names(sapply(seq_infered, getN))),
            by="Run") %>%
  left_join(rowSums(seqtable.nochim) %>%
              as.data.frame %>%
              rename(nonchim=".") %>%
              mutate(Run=rownames(seqtable.nochim)),
            by="Run") %>%
  relocate(Run) %>%
  mutate("%kept_input->output"=as.integer(nonchim*100/input))

# Show final table: for each row/sample, we have shown the initial number of reads, filtered reads, denoised reads, and non chimera reads
track

ASSIGN TAXONOMY

Extensions: The dada2 package also implements a method to make species level assignments based on exact matching between ASVs and sequenced reference strains. Recent analysis suggests that exact matching (or 100% identity) is the only appropriate way to assign species to 16S gene fragments. Currently, species-assignment training fastas are available for the Silva and RDP 16S databases. To follow the optional species addition step, download the silva_species_assignment_v132.fa.gz file, and place it in the directory with the fastq files.

# Assign taxonomy (with silva v138)
taxa <- assignTaxonomy(seqtable.nochim, "/Users/scarcy/Desktop/Praktikum/Data/silva_nr_v138_train_set.fa.gz",
                       tryRC = TRUE, # try reverse complement of the sequences
                       multithread=TRUE, verbose = TRUE)

# Add species assignment
taxa <- addSpecies(taxa, "/Users/scarcy/Desktop/Praktikum/Data/silva_species_assignment_v138.fa.gz")
# Check how the taxonomy table looks like
taxa.print <- taxa 
rownames(taxa.print) <- NULL # Removing sequence rownames for display only
head(taxa.print)
##      Kingdom    Phylum           Class                 Order             
## [1,] "Bacteria" "Proteobacteria" "Gammaproteobacteria" "Enterobacterales"
## [2,] "Bacteria" "Proteobacteria" "Gammaproteobacteria" "Enterobacterales"
## [3,] "Bacteria" "Bacteroidota"   "Bacteroidia"         "Bacteroidales"   
## [4,] "Bacteria" "Bacteroidota"   "Bacteroidia"         "Bacteroidales"   
## [5,] "Bacteria" "Bacteroidota"   "Bacteroidia"         "Bacteroidales"   
## [6,] "Bacteria" "Firmicutes"     "Clostridia"          "Oscillospirales" 
##      Family               Genus                  Species    
## [1,] "Enterobacteriaceae" "Escherichia/Shigella" NA         
## [2,] "Enterobacteriaceae" "Escherichia/Shigella" NA         
## [3,] "Bacteroidaceae"     "Bacteroides"          "vulgatus" 
## [4,] "Bacteroidaceae"     "Bacteroides"          "vulgatus" 
## [5,] "Bacteroidaceae"     "Bacteroides"          "uniformis"
## [6,] "Ruminococcaceae"    "Faecalibacterium"     NA
table(taxa.print[,1]) # Show the different kingdoms
## 
##   Archaea  Bacteria Eukaryota 
##        47     16910      1076
table(taxa.print[,2]) # Show the different phyla
## 
##  Abditibacteriota   Acidobacteriota  Actinobacteriota        Ascomycota 
##                 2                27               801                 1 
##      Bacteroidota     Basidiomycota  Bdellovibrionota  Campilobacterota 
##              2396                 9                 1                30 
##       Chloroflexi     Crenarchaeota     Cyanobacteria      Deinococcota 
##                 8                 6               143                18 
##  Desulfobacterota   Elusimicrobiota Entotheonellaeota        Euglenozoa 
##               137                 6                 1                37 
##     Euryarchaeota        Firmicutes    Fusobacteriota   Gemmatimonadota 
##                21             11373                70                 2 
##     Halobacterota   Hydrogenedentes    Incertae_Sedis Methylomirabilota 
##                 1                 1                20                 1 
##      Mucoromycota       Myxococcota      Nitrospirota   Patescibacteria 
##                 8                 6                 1                13 
##   Planctomycetota    Proteobacteria           RCP2-54     Spirochaetota 
##                17              1331                 2                10 
##      Synergistota  Thermoplasmatota Verrucomicrobiota        Vertebrata 
##                30                19               140                 1 
##             WOR-1 
##                 1
table(is.na(taxa.print[,2])) # is there any NA phyla?
## 
## FALSE  TRUE 
## 16691  1740

LAST PREPROCESSING

We will remove any sample with less than 500 reads from further analysis, and also any ASVs with unassigned phyla.

1. Create phyloseq object

The preprocessing will be easier to do with ASV, taxonomic and metadata tables combined in a phyloseq object.

#_________________________
# Import metadata
metadata <- read.csv(file.path(path, "Metadata-AGP.csv"))

# Keep relevant covariates
sampledf <- metadata %>%
  select(-"pcr_primers..exp.")
rownames(sampledf) <- sampledf$Run

sampledf$author <- "AGP"
sampledf$sequencing_tech <- "Illumina"
sampledf$variable_region <- "V4"

#_________________________
# Create phyloseq object
physeq <- phyloseq(otu_table(seqtable.nochim, taxa_are_rows=FALSE), # by default, in otu_table the sequence variants are in rows
                   sample_data(sampledf), 
                   tax_table(taxa))

# Remove samples with less than 500 reads
physeq <- prune_samples(sample_sums(physeq)>=500, physeq)
# Remove taxa that are eukaryota, or have unassigned Phyla
physeq <- subset_taxa(physeq, Kingdom != "Eukaryota")
physeq <- subset_taxa(physeq, !is.na(Phylum))
physeq <- filter_taxa(physeq, function(x) sum(x > 0) > 0, TRUE) # keep ASVs present in at least 1 sample
physeq.filt <- filter_taxa(physeq, function(x) sum(x > 0) >= 2, TRUE)

2. Save to disk

# Save to disk
saveRDS(raw_stats, "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/01_Dada2-AGP/raw_stats.rds")
saveRDS(out2, "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/01_Dada2-AGP/out2.rds")
saveRDS(err, "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/01_Dada2-AGP/err.rds")
saveRDS(seq_infered, "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/01_Dada2-AGP/seq_infered.rds")


saveRDS(otu_table(physeq), "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/01_Dada2-AGP/ASVtable_final.rds") # ASV table
saveRDS(tax_table(physeq), "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/01_Dada2-AGP/taxa_final.rds") # taxa table
saveRDS(sample_data(physeq), "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/01_Dada2-AGP/metadata_final.rds") # metadata table
write.csv(sample_data(physeq), "~/Projects/IBS_Meta-analysis_16S/analysis-individual/AGP/Metadata-AGP.csv")

saveRDS(physeq, "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/01_Dada2-AGP/physeq_AGP.rds") # phyloseq object
saveRDS(physeq.filt, "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/AGP/01_Dada2-AGP/physeq(ASVfilt)_AGP.rds")

# Save nb of reads per sample before / after dada2 pipeline
nb.reads <- readRDS("~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/nb_reads.rds")
nb.reads[1:1191, 'AGP_before'] <- fastqq(FNs)@nReads
nb.reads[1:1191, 'AGP_after'] <- rowSums(seqtable.nochim)
median(na.omit(nb.reads$AGP_before)) # sanity check
median(na.omit(nb.reads$AGP_after)) # sanity check
saveRDS(nb.reads, "~/Projects/IBS_Meta-analysis_16S/data/analysis-individual/nb_reads.rds")

3. Quick peek at data analysis

# Absolute abundance
# plot_bar(physeq, fill = "Phylum")+ facet_wrap("host_disease", scales="free") + theme(axis.text.x = element_blank()) # long to compute

# Relative abundance for Phylum
phylum.table <- physeq %>%
  tax_glom(taxrank = "Phylum") %>%                     # agglomerate at phylum level
  transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
  psmelt()                                             # Melt to long format

ggplot(phylum.table, aes(x = reorder(Sample, Sample, function(x) mean(phylum.table[Sample == x & Phylum == 'Bacteroidota', 'Abundance'])),
                         y = Abundance, fill = Phylum))+
  facet_wrap(~ host_disease, scales = "free") + # scales = "free" removes empty lines
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_blank())+
  labs(x = "Samples", y = "Relative abundance")